# For manipulating the datasets
library(dplyr)
library(readr)
# For plotting correlation matrix
library(ggcorrplot)
# Machine Learning library
library(caret)
# For Multi-core processing support
library(doMC)
# Register the doMC parallel backend with three worker cores
registerDoMC(cores = 3)
THE WINE QUALITY DATASET
The two datasets are related to red and white variants of the Portuguese “Vinho Verde” wine. For more details, consult the reference [Cortez et al., 2009]. Due to privacy and logistic issues, only physicochemical (inputs) and sensory (the output) variables are available (e.g. there is no data about grape types, wine brand, wine selling price, etc.).
These datasets can be viewed as classification or regression tasks. The classes are ordered and not balanced (e.g. there are many more normal wines than excellent or poor ones).
Available at https://archive.ics.uci.edu/ml/datasets/wine+quality
# Load the white-wine ("blanco") training set from its gzipped CSV
winedataset_blanco <- read_csv(file = "data/blanco_train.csv.gz")
Parsed with column specification:
cols(
`fixed acidity` = col_double(),
`volatile acidity` = col_double(),
`citric acid` = col_double(),
`residual sugar` = col_double(),
chlorides = col_double(),
`free sulfur dioxide` = col_double(),
`total sulfur dioxide` = col_double(),
density = col_double(),
pH = col_double(),
sulphates = col_double(),
alcohol = col_double(),
quality = col_integer()
)
# Load the red-wine ("tinto") training set from its gzipped CSV
winedataset_red <- read_csv(file = "data/tinto_train.csv.gz")
Parsed with column specification:
cols(
`fixed acidity` = col_double(),
`volatile acidity` = col_double(),
`citric acid` = col_double(),
`residual sugar` = col_double(),
chlorides = col_double(),
`free sulfur dioxide` = col_double(),
`total sulfur dioxide` = col_double(),
density = col_double(),
pH = col_double(),
sulphates = col_double(),
alcohol = col_double(),
quality = col_integer()
)
# Tag every row with its wine colour so the origin survives the merge
winedataset_blanco[["type"]] <- "white"
winedataset_red[["type"]] <- "red"

# Stack the white and red tables into a single dataset
winedataset <- rbind(winedataset_blanco, winedataset_red)

# Show the combined dataset
winedataset
#winedataset %>% map(is.null)

# Number of wines for each quality score
winedataset %>%
  group_by(quality) %>%
  summarise(total = n())

# Number of wines for each (total sulfur dioxide, quality) combination
winedataset %>%
  group_by(`total sulfur dioxide`, quality) %>%
  summarise(total = n())
# Reshape to long form (one row per variable/value pair) and draw one box per
# measured variable on a shared y axis; the legend is hidden because the
# x axis already names each variable.
reshape2::melt(winedataset) %>%
  ggplot() +
  geom_boxplot(aes(x = variable, y = value, fill = variable)) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )
Using type as id variables
# Interactive version of a ggplot. No plot is passed, so ggplotly() uses its
# default plot argument (presumably the most recent ggplot, i.e. the box plot
# above -- TODO confirm against the plotly documentation).
plotly::ggplotly()
# Correlation matrix of all columns except the text `type` column
cor_matrix <- cor(select(winedataset, -type))
ggcorrplot(cor_matrix)

# Scatter-plot matrix over the same columns
pairs(select(winedataset, -type))
# Build the modelling table: `quality` becomes the class label (a factor) and
# the wine-type column is dropped.
trainset <- winedataset
trainset$quality <- as.factor(trainset$quality)
trainset <- trainset %>% select(-type)

# List the remaining column names
names(trainset) %>% as.data.frame()

# createDataPartition() samples at random; fix the RNG seed so the 80/20
# split, and every metric computed from it, is the same on each run.
set.seed(42)
# `quality` is already a factor here, so no further conversion is needed.
trainIndex <- createDataPartition(trainset$quality, p = 0.80, list = FALSE)
data_train <- trainset[trainIndex, ]
data_test <- trainset[-trainIndex, ]

# Column names such as `fixed acidity` contain spaces; turn them into
# syntactically valid R names in both partitions.
colnames(data_train) <- make.names(colnames(data_train))
colnames(data_test) <- make.names(colnames(data_test))
# Class balance of the training partition: one bar per quality level
data_train %>%
  group_by(quality) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = quality, y = total, fill = quality)) +
  geom_col() +
  theme_classic()

# Same chart for the held-out test partition
data_test %>%
  group_by(quality) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = quality, y = total, fill = quality)) +
  geom_col() +
  theme_classic()
# Resampling setup for a quick fit: plain 5-fold cross-validation.
# `repeats` is deliberately not set: with method = "cv" caret only warns
# that it "has no meaning for this resampling method".
ctrl_fast <- trainControl(
  method = "cv",
  number = 5,
  # summaryFunction = twoClassSummary,
  verboseIter = TRUE,    # log progress for each resample
  classProbs = FALSE,    # class predictions only, no class probabilities
  allowParallel = TRUE   # use the registered parallel backend if there is one
)
`repeats` has no meaning for this resampling method.
# Print the model stored in the fit's `finalModel` element.
# NOTE(review): `rfFitupsam` is not assigned anywhere in the visible source;
# presumably it comes from a caret::train() call that is missing here -- confirm.
rfFitupsam$finalModel
Call:
randomForest(x = x, y = y, mtry = param$mtry)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 33.34%
Confusion matrix:
3 4 5 6 7 8 9 class.error
3 0 1 14 7 0 0 0 1.0000000
4 1 18 76 43 2 0 0 0.8714286
5 0 2 955 404 3 0 0 0.2998534
6 0 2 302 1415 95 0 0 0.2199559
7 0 0 16 322 351 3 0 0.4927746
8 0 0 1 55 34 34 0 0.7258065
9 0 0 0 3 1 0 0 1.0000000
# Unscaled variable importance of the fitted model
importance <- varImp(rfFitupsam, scale = FALSE)
plot(importance)

# Predict the quality class for the held-out rows
predsrfprobsamp <- predict(rfFitupsam, newdata = data_test)

# use for regression
#confusionMatrix(as.factor(predsrfprobsamp %>% round()),as.factor(data_test$quality))

# Compare predictions with the true labels (`quality` is already a factor)
confusionMatrix(data = predsrfprobsamp, reference = data_test$quality)
Confusion Matrix and Statistics
Reference
Prediction 3 4 5 6 7 8 9
3 0 1 0 0 0 0 0
4 0 6 6 0 0 0 0
5 3 15 239 89 5 0 0
6 1 10 94 324 69 9 0
7 1 2 2 39 99 13 1
8 0 0 0 1 0 8 0
9 0 0 0 0 0 0 0
Overall Statistics
Accuracy : 0.6519
95% CI : (0.622, 0.6809)
No Information Rate : 0.4368
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.4638
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8 Class: 9
Sensitivity 0.0000000 0.176471 0.7009 0.7152 0.57225 0.266667 0.0000000
Specificity 0.9990310 0.994018 0.8391 0.6866 0.93287 0.999007 1.0000000
Pos Pred Value 0.0000000 0.500000 0.6809 0.6391 0.63057 0.888889 NaN
Neg Pred Value 0.9951737 0.972683 0.8513 0.7566 0.91591 0.978599 0.9990357
Prevalence 0.0048216 0.032787 0.3288 0.4368 0.16683 0.028930 0.0009643
Detection Rate 0.0000000 0.005786 0.2305 0.3124 0.09547 0.007715 0.0000000
Detection Prevalence 0.0009643 0.011572 0.3385 0.4889 0.15140 0.008679 0.0000000
Balanced Accuracy 0.4995155 0.585244 0.7700 0.7009 0.75256 0.632837 0.5000000
#confusionmat <- table(predsrfprobsamp %>% round(),as.factor(data_test$quality))
# Cross-tabulate predicted classes (rows) against the test-set labels (columns).
# The row dimension takes its name from the `predsrfprobsamp` variable, which
# the heatmap code further down refers to, so keep the call in this exact form.
confusionmat <- table(predsrfprobsamp,as.factor(data_test$quality))
confusionmat
predsrfprobsamp 3 4 5 6 7 8 9
3 0 1 0 0 0 0 0
4 0 6 6 0 0 0 0
5 3 15 239 89 5 0 0
6 1 10 94 324 69 9 0
7 1 2 2 39 99 13 1
8 0 0 0 1 0 8 0
9 0 0 0 0 0 0 0
# Heatmap of the confusion matrix: predicted class on x, actual class on y,
# with the cell count printed in each tile.
#
# * melt() gets explicit column names, so the plot does not depend on how
#   table() happened to label its dimensions.
# * as.is = TRUE keeps the class labels as text; otherwise melt() converts
#   "3".."9" to numbers, which do not belong on a discrete axis.
# * Axis limits come from the table's own dimnames, so every class present
#   in the matrix is drawn. The previous hard-coded 'low'/'medium'/'high'
#   limits did not match any class in this 7-level result.
# * x is reversed relative to y, as in the original layout.
reshape2::melt(
  confusionmat,
  varnames = c("predicted", "actual"),
  as.is = TRUE
) %>%
  ggplot(aes(x = predicted, y = actual)) +
  geom_tile(aes(fill = value), colour = "white") +
  geom_text(aes(label = sprintf("%1.0f", value)), vjust = 1) +
  scale_fill_gradient(low = "blue", high = "red") +
  xlab("Predicted quality") +
  ylab("Actual quality") +
  scale_y_discrete(limits = colnames(confusionmat)) +
  scale_x_discrete(limits = rev(rownames(confusionmat))) +
  theme_bw() +
  theme(legend.position = "none")